Code blocks: Initial data processing

All code used for the analyses can be found throughout this report by expanding the code-blocks in the .html-file on the right-hand side within the appropriate tabs, or through the associated .Rmd file, where it can also be run. You will have to install all the necessary packages yourself though, which can be non-trivial.

Libraries

suppressPackageStartupMessages({
  library('biomaRt') # downloading gene data
  library("DT") # for pretty printing of paged tables
  library("limma") # RNA-seq analysis
  library("edgeR") # RNA-seq analysis
  library("DESeq2") # RNA-seq analysis
  library("here") # Better structure and relative paths
  library("GO.db") # downloading gene data
  library("org.Mm.eg.db")
  library("cbmr") # in-house convenience functions
  library("AnnotationDbi")
  library("ggrepel")
  # library("sva")
  # library("DESeq2")
  library("RUVSeq")
  library("tidyverse") # general data manipulation
})

full_script_name <- rstudioapi::getSourceEditorContext()$path
script_name <- basename(full_script_name)
data_dir <- paste0(tools::file_path_sans_ext(full_script_name),"_data/")
dir.create(data_dir, showWarnings = F)

General data processing

# load data
g_counts_raw <- readRDS("../data-raw/salmon.merged.gene_counts.rds")
rna_counts_all <- g_counts_raw %>% assays() %>% .$counts

# fix sample IDS in count matrix
colnames(rna_counts_all) <- 
  sub(pattern = "X[0-9]{4}_", replacement = "", x = colnames(rna_counts_all)) %>% 
  sub(pattern = "_S[0-9]{1,2}", replacement = "", x=.)

# fix sample IDS in meta data sheet
meta_data_sheet <- readxl::read_excel("0177_metadata.xlsx") %>%
  transmute(`Sample.ID`=sub(pattern = ".....", replacement = "", x=`Sample ID (Library ID)`), 
            gRNA = factor(gsub(x = Condition1, pattern = "g|gRNA_", replacement = ""), levels=c("EV", "PTPRK", "PHOX2B")),
            Media = factor(gsub(x = Condition2, pattern = "_media", replacement = ""), levels=c("proliferation","differentiation")),
            Extraction=factor(`Extraction pool (A,B,C)`, levels=c("A", "B", "C", "D", "E")),
            Replicate=factor(gsub(`Condition 3`, pattern="replicate_", replacement = ""), levels=c("1","2","3")),
            Observed_batch=factor(Comments)) %>%
  as.data.frame()

#Create DGE list of relevant samples
rna_counts <- rna_counts_all[, as.character(c(1:18))]

y <- DGEList(rna_counts, 
             group = meta_data_sheet$Observed_batch,
             samples=meta_data_sheet)

design <-  model.matrix(~0 + group +Media*gRNA, data=y$samples)
colnames(design) <- str_replace(string = colnames(design), pattern = ":", replacement = "_")

idx_expressed <- filterByExpr(y, design = design)
y <- y[idx_expressed, ]
y <- calcNormFactors(y)

# Calculate CPM and print
log_cpm <- cpm(y, log=T)
saveRDS(object =log_cpm, file=paste0(data_dir, 
                                           "log_CPM_values_ensembl_ID_by_sample.RDS"))
write_tsv(x = as.data.frame(log_cpm),
                  file=paste0(data_dir, "log_CPM_values_ensembl_ID_by_sample.tsv"))

Get gene info

# Get Description, external gene name via Ensembl/biomaRt
# mart <- useDataset("mmusculus_gene_ensembl", useMart("ensembl"))
# saveRDS(mart, file=paste0("shared_data/", "mouse_mart.RDS"))
mart <-  readRDS(file = paste0("shared_data/", "mouse_mart.RDS"))

gene_info_list <- getBM(filters= "ensembl_gene_id", 
                        attributes= c("ensembl_gene_id", "external_gene_name","description"),
                        values=rownames(y),
                        mart=mart)

# get names via org.Mm.eg.db
gene_names <- mapIds(x = org.Mm.eg.db,
                     keys = rownames(y),
                     column = "SYMBOL",
                     keytype = "ENSEMBL", 
                     multiVals = "first")
## 'select()' returned 1:many mapping between keys and columns
# uncorrected
log_cpm_with_names <- log_cpm
rownames(log_cpm_with_names) <- gene_names

saveRDS(object=log_cpm_with_names, file = paste0(data_dir, "log_CPM_values_gene_name_by_sample.RDS"))

write_tsv(x = as.data.frame(log_cpm_with_names),
                  file =paste0(data_dir, "log_CPM_values_gene_name_by_sample.tsv"))

EdgeR differential gene expression

y <- estimateDisp(y, design = design)
fit <- glmQLFit(y, design = y$design, robust = TRUE)

contrast_matrix <- makeContrasts(batch_effect=groupA-groupB,
                                 levels=design)

all_genes_test_full_info <- list()
for (contrast in colnames(contrast_matrix)) {
  all_genes_test <- edgeR_tester(contrast_matrix = contrast_matrix, contrast=contrast, efit = fit)
  all_genes_test_full_info[[contrast]] <-  merge(x=all_genes_test, 
                                                 y=gene_info_list, 
                                                 by.x = "ENSEMBL", by.y="ensembl_gene_id",
                                                 all.x=T)
}

# print tsv-tables:
for (contrast in colnames(contrast_matrix)) {
  all_genes_test_full_info[[contrast]] %>% write_tsv(file=paste0(data_dir, contrast,"_DGE_analysis.tsv"))
}

Gene ontology analysis

GO_terms <- get_enrichment_terms(org_db = org.Mm.eg.db, ensembl_ids = rownames(y),
                                 min_genes = 5, max_genes = 500)$BP
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
ontology_tests <- run_ontology_tests(GO_terms, 
                                     y, 
                                     contrast_matrix = contrast_matrix, 
                                     fun=limma::camera)
## 67.1286799356878% of genes have an annotation
for (contrast in colnames(contrast_matrix)) {
  ontology_tests[[contrast]] %>% write_tsv(file=paste0(data_dir, contrast,"_GO_analysis.tsv"))
}

Session-info

sessionInfo()
## R version 4.1.1 (2021-08-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] forcats_0.5.1               stringr_1.4.0              
##  [3] dplyr_1.0.7                 purrr_0.3.4                
##  [5] readr_2.1.1                 tidyr_1.1.4                
##  [7] tibble_3.1.6                tidyverse_1.3.1            
##  [9] RUVSeq_1.28.0               EDASeq_2.28.0              
## [11] ShortRead_1.52.0            GenomicAlignments_1.30.0   
## [13] Rsamtools_2.10.0            Biostrings_2.62.0          
## [15] XVector_0.34.0              BiocParallel_1.28.3        
## [17] ggrepel_0.9.1               ggplot2_3.3.5              
## [19] cbmr_0.0.0.9001             org.Mm.eg.db_3.14.0        
## [21] GO.db_3.14.0                AnnotationDbi_1.56.2       
## [23] here_1.0.1                  DESeq2_1.34.0              
## [25] SummarizedExperiment_1.24.0 Biobase_2.54.0             
## [27] MatrixGenerics_1.6.0        matrixStats_0.61.0         
## [29] GenomicRanges_1.46.1        GenomeInfoDb_1.30.0        
## [31] IRanges_2.28.0              S4Vectors_0.32.3           
## [33] BiocGenerics_0.40.0         edgeR_3.36.0               
## [35] limma_3.50.0                DT_0.20                    
## [37] biomaRt_2.50.1             
## 
## loaded via a namespace (and not attached):
##   [1] readxl_1.3.1           backports_1.4.1        aroma.light_3.24.0    
##   [4] BiocFileCache_2.2.0    splines_4.1.1          digest_0.6.29         
##   [7] foreach_1.5.1          htmltools_0.5.2        fansi_0.5.0           
##  [10] magrittr_2.0.1         memoise_2.0.1          tzdb_0.2.0            
##  [13] annotate_1.72.0        modelr_0.1.8           vroom_1.5.7           
##  [16] R.utils_2.11.0         prettyunits_1.1.1      jpeg_0.1-9            
##  [19] colorspace_2.0-2       rvest_1.0.2            blob_1.2.2            
##  [22] rappdirs_0.3.3         haven_2.4.3            xfun_0.28             
##  [25] crayon_1.4.2           RCurl_1.98-1.5         jsonlite_1.7.2        
##  [28] genefilter_1.76.0      survival_3.2-13        iterators_1.0.13      
##  [31] glue_1.5.1             gtable_0.3.0           zlibbioc_1.40.0       
##  [34] DelayedArray_0.20.0    scales_1.1.1           DBI_1.1.1             
##  [37] Rcpp_1.0.7             xtable_1.8-4           progress_1.2.2        
##  [40] bit_4.0.4              htmlwidgets_1.5.4      httr_1.4.2            
##  [43] RColorBrewer_1.1-2     ellipsis_0.3.2         pkgconfig_2.0.3       
##  [46] XML_3.99-0.8           R.methodsS3_1.8.1      sass_0.4.0            
##  [49] dbplyr_2.1.1           locfit_1.5-9.4         utf8_1.2.2            
##  [52] tidyselect_1.1.1       rlang_0.4.12           munsell_0.5.0         
##  [55] cellranger_1.1.0       tools_4.1.1            cachem_1.0.6          
##  [58] cli_3.1.0              generics_0.1.1         RSQLite_2.2.9         
##  [61] broom_0.7.10           evaluate_0.14          fastmap_1.1.0         
##  [64] yaml_2.2.1             fs_1.5.2               knitr_1.36            
##  [67] bit64_4.0.5            KEGGREST_1.34.0        R.oo_1.24.0           
##  [70] xml2_1.3.3             compiler_4.1.1         rstudioapi_0.13       
##  [73] filelock_1.0.2         curl_4.3.2             png_0.1-7             
##  [76] reprex_2.0.1           statmod_1.4.36         geneplotter_1.72.0    
##  [79] bslib_0.3.1            stringi_1.7.6          GenomicFeatures_1.46.1
##  [82] lattice_0.20-45        Matrix_1.4-0           vctrs_0.3.8           
##  [85] pillar_1.6.4           lifecycle_1.0.1        jquerylib_0.1.4       
##  [88] data.table_1.14.2      bitops_1.0-7           rtracklayer_1.54.0    
##  [91] R6_2.5.1               BiocIO_1.4.0           latticeExtra_0.6-29   
##  [94] hwriter_1.3.2.1        codetools_0.2-18       MASS_7.3-54           
##  [97] assertthat_0.2.1       rprojroot_2.0.2        rjson_0.2.20          
## [100] withr_2.4.3            GenomeInfoDbData_1.2.7 parallel_4.1.1        
## [103] hms_1.1.1              grid_4.1.1             rmarkdown_2.11        
## [106] lubridate_1.8.0        restfulr_0.0.13

Initial assessment of data quality

n_samples <- nrow(meta_data_sheet)
n_genes <-nrow(y)
n_counts_per_sample <- colSums(rna_counts)
n_counts_total <-  sum(n_counts_per_sample)
n_counts_per_gene_per_sample <- (n_counts_total/n_genes)/n_samples

From the sequencing, we obtain 18037 genes that are expressed in a appreciable amount across the samples to be used for our downstream analyses. With a final, filtered transcript count of 317 million across the 18 samples, we have an average of 977 RNA-transcripts for each gene in each sample.

Check out the tabs below to see various QC-visualizations of the data:

Multiple dimensional scaling (MDS) heatmap

The images below shows heatmaps of the log-normalized distances between each possible sample-pair with more similar pairs having negative values, and less similar pairs having positive values. Here, we can see that the samples seem to cluster based on genotype, but not as much on treatment.

annotation_col_df <-  dplyr::select(meta_data_sheet, gRNA, Extraction)
rownames(annotation_col_df) <- meta_data_sheet$Sample.ID

annotation_row_df <-  dplyr::select(meta_data_sheet, Media)
rownames(annotation_row_df) <- meta_data_sheet$Sample.ID

plot_sample_heatmap(log_cpm,
                    method="MDS",
                    labels_col = meta_data_sheet$Sample.ID,
                    labels_row = meta_data_sheet$Sample.ID,
                    annotation_col = annotation_col_df[c("gRNA", "Extraction")],
                    annotation_row = annotation_row_df["Media"],
                    display_numbers=T,
                    fontsize_number=6,
                    main="Multiple dimensional scaling (MDS) heatmap"
)

Multi-dimensional scaling plot

The figures below show the location of the samples in reduced 2-dimensional space.

ggplot_mds(y = y, 
           dim_plot = c(1,2),
           size=1,
           colour_by = y$samples$Media) +
  ggtitle("MDS-plot colored by Media (Dimensions 1 & 2)") + 
  aes(label = rownames(y$samples)) +
  geom_label_repel()

ggplot_mds(y = y, 
           dim_plot = c(3,4),
           size=1,
           colour_by = y$samples$Media) +
  ggtitle("MDS-plot colored by Media (Dimensions 3 & 4)") + 
  aes(label = rownames(y$samples)) +
  geom_label_repel()

ggplot_mds(y = y, 
           dim_plot = c(1,2),
           colour_by = y$samples$gRNA,
           size=1) +
  ggtitle("MDS-plot colored by gRNA") + 
  aes(label = rownames(y$samples)) +
  geom_label_repel()

MD-plots

The plots below show the log2-fold change in gene expression for all genes in each sample, compared to the rest of the samples. Here, we see the typical distribution with only a few outliers.

withr::with_par(list(mfrow = c(2, 3)), {
  for (i in seq_len(ncol(y))) {
    edgeR::plotMD.DGEList(y, column = i,cex.lab=1)
    graphics::abline(h = 0)
  }
})

Analysis results

The investigated contrasts and their P-value distributions

Throughout the analysis-section, you will see tables containing the differential gene expression analysis results as well as the gene-ontology enrichment results for the different contrasts, or groupings, as outlined below:

  1. Untreated 5xFAD hemizygote vs WT-mice (hemi_control_vs_WT)
  2. JOP85-treated 5xFAD hemizygote vs WT-mice (PHOX2B_vs_EV_differentiation)
  3. JOP85-treated vs untreated 5xFAD hemizygote mice (PTPRK_vs_EV_proliferation)

Below is shown the resulting, unadjusted P-value distribution from differential gene expression testing for each contrast:

withr::with_par(list(mfrow = c(1,3)),
                {for (contrast in colnames(contrast_matrix)) {
                  hist(all_genes_test_full_info[[contrast]]$PValue,
                       main = contrast, xlab = "P-value") }}
)

In the following sections, you can order the rows in the tables as you wish by directly interacting with the tables. Similarly, you can search for specific genes or GO-terms, limit the data-ranges and download the data as excel and csv-files. (Copies of the full tables are already found in the corresponding data directory, since searching in the full tables here can be a bit slow).

The gene ontology analysis investigates if some gene ontology terms are over-represented among the up-regulated and down-regulated genes respectively to investigate if specific biological processes are up- or down-regulated.

PHOX2B vs EV with proliferation media

Differential gene expression

Significant genes (FDR<0.05)

all_genes_test_full_info[["batch_effect"]] %>% 
  filter(FDR<0.05) %>% 
  transmute(Gene = external_gene_name,
            ENSEMBL_ID = rownames(all_genes_test_full_info),
            description,
            logFC=round(logFC,2),
            PValue=round(PValue,2),
            FDR=round(FDR,5)) %>% 
  DT::datatable(extensions = 'Buttons', filter="top", rownames = FALSE, options = list(
    dom = 'Bfrtip',buttons = c('copy', 'csv', 'excel', 'colvis')))

All genes

all_genes_test_full_info[["batch_effect"]] %>% 
  transmute(Gene = external_gene_name, 
            ENSEMBL_ID = rownames(all_genes_test_full_info),
            description,
            logFC=round(logFC,2),
            PValue=round(PValue,2),
            FDR=round(FDR,5)) %>% 
  DT::datatable(extensions = 'Buttons', filter="top", rownames = FALSE, options = list(
    dom = 'Bfrtip',buttons = c('copy', 'csv', 'excel', 'colvis')))

Volcano-plot

plot_volcano <- function(contrast){
  df <- all_genes_test_full_info[[contrast]]
  allLogFc <- df[["logFC"]]
  minLogFc <- min(allLogFc)
  maxLogFc <- max(allLogFc)
  minPval <- max(-log10(df[["PValue"]]))
  maxPval <- 0
  ggplot(df, aes(x = logFC, 
                 y = -log10(PValue), 
                 colour = FDR < 0.05
  )
  ) + 
    geom_point(size=0.5, alpha=0.2) +
    scale_color_manual(values = c("TRUE" = "red", "FALSE" = "black"), name="Significance", labels = c("TRUE" = "FDR < 0.05", "FALSE" = "FDR ≥ 0.05"), breaks = c("TRUE", "FALSE")) +
    ylab(expression(paste(-log[10], "(P-value)"))) +
    xlab(expression(paste(log[2], "(Fold Change)"))) +
    scale_x_continuous(limits = c(minLogFc, maxLogFc)) +
    scale_y_continuous(limits = c(maxPval, minPval)) +
    theme_bw()
}

plot_volcano("batch_effect")

Gene ontology analysis

Significant GO-terms (FDR<0.5)

ontology_tests[["batch_effect"]] %>% 
  filter(FDR<0.05) %>%
  transmute(ID, 
            Direction=as.factor(Direction), 
            TERM, 
            `#genes`=NGenes, 
            PValue=round(PValue,2),
            FDR=round(FDR, 5)) %>% 
  DT::datatable(extensions = 'Buttons', filter="top", rownames = FALSE, options = list(
    dom = 'Bfrtip',buttons = c('copy', 'csv', 'excel', 'colvis')))

All GO-terms

ontology_tests[["batch_effect"]] %>% 
  transmute(ID, Direction=as.factor(Direction), TERM, `#genes`=NGenes, PValue=round(PValue,2),FDR=round(FDR, 5)) %>% 
  DT::datatable(extensions = 'Buttons', filter="top", rownames = FALSE, options = list(
    dom = 'Bfrtip',buttons = c('copy', 'csv', 'excel', 'colvis')))